In [1]:
import scanpy as sc
import copy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import PolygonSelector
from matplotlib.path import Path
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
In [1]:
import numpy as np
mel = np.load('/QRISdata/Q1851/STOmics_SAW_Outputs/lncRNA/Melanoma.npy')
mel
Out[1]:
array([[ 2380, 11100],
[ 2400, 6960],
[ 2400, 10260],
...,
[10120, 18980],
[10140, 18920],
[10140, 18960]], dtype=uint32)
In [27]:
import scanpy as sc
lnc_bin20 = sc.read_h5ad("lnc_bin20_sc_processed_leiden.h5ad")
#IBD_bin20 = sc.read_h5ad("/QRISdata/Q1851/STOmics_SAW_Outputs/IBD/visualization/C05096G2.bin20_1.0.h5ad")
lnc_bin20.var['ENS']=lnc_bin20.var.index
lnc_bin20.var.index=lnc_bin20.var['real_gene_name']
lnc_bin20.raw.var['ENS']=lnc_bin20.raw.var.index
lnc_bin20.raw.var.index=lnc_bin20.raw.var['real_gene_name']
lnc_bin20.var_names = lnc_bin20.var['real_gene_name']
lnc_bin20
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:887: UserWarning:
AnnData expects .var.index to contain strings, but got values like:
['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']
Inferred to be: categorical
names = self._prep_dim_index(names, "var")
Out[27]:
AnnData object with n_obs × n_vars = 512623 × 206923
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'leiden_res_0.01', 'leiden_res_0.10', 'leiden_res_0.50'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden', 'leiden_resolution', 'log1p', 'merged', 'neighbors', 'omics', 'pca', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'umap'
obsm: 'X_pca', 'X_umap', 'spatial'
varm: 'PCs'
layers: 'counts'
obsp: 'connectivities', 'distances'
In [2]:
import pickle
# Load the object from the pickle file
with open("/QRISdata/Q1851/STOmics_SAW_Outputs/lncRNA/visualization/lncRNA_bin20_annotated_cancer.pkl", "rb") as f:
lnc_bin20 = pickle.load(f)
lnc_bin20.var['ENS']=lnc_bin20.var.index
lnc_bin20.var.index=lnc_bin20.var['real_gene_name']
lnc_bin20.raw.var['ENS']=lnc_bin20.raw.var.index
lnc_bin20.raw.var.index=lnc_bin20.raw.var['real_gene_name']
lnc_bin20.var_names = lnc_bin20.var['real_gene_name']
lnc_bin20
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:887: UserWarning:
AnnData expects .var.index to contain strings, but got values like:
['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']
Inferred to be: categorical
names = self._prep_dim_index(names, "var")
Out[2]:
AnnData object with n_obs × n_vars = 512623 × 206923
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [6]:
lnc_bin20.uns['omics']
Out[6]:
array(['Transcriptomics'], dtype=object)
In [29]:
lnc_bin20 = lnc_bin20[:, ~lnc_bin20.var_names.str.startswith("cuTAR")].copy()
lnc_bin20
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Out[29]:
AnnData object with n_obs × n_vars = 512623 × 34559
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [30]:
import anndata
import pandas as pd
# Assume lnc_bin20 is your AnnData object
# Check current var_names and their type
print("Original var_names:", lnc_bin20.var_names[:5]) # Preview first 5
print("Type of var_names:", type(lnc_bin20.var_names))
print("Is categorical:", pd.api.types.is_categorical_dtype(lnc_bin20.var_names))
# Option 1: Convert var_names to strings and make unique
# This avoids the categorical issue by using a simple string index
lnc_bin20.var_names = lnc_bin20.var_names.astype(str) # Convert to string
lnc_bin20.var_names_make_unique() # Now this should work
# Verify the result
print("Updated var_names:", lnc_bin20.var_names[:5]) # Check first 5
print("Any duplicates:", lnc_bin20.var_names.has_duplicates) # Should be False
# Optional: Save the modified AnnData object
#lnc_bin20.write_h5ad('lnc_bin20_unique.h5ad')
Original var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name') Type of var_names: <class 'pandas.core.indexes.base.Index'> Is categorical: False Updated var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name') Any duplicates: False
In [31]:
# Step 2: Check and fix var_names duplicates BEFORE setting .raw
print("Before fix:")
print("Any duplicates in adata.var_names:", lnc_bin20.var_names.has_duplicates)
dupes = lnc_bin20.var_names[lnc_bin20.var_names.duplicated()]
print("Duplicated gene names:")
print(dupes)
# Fix duplicates
lnc_bin20.var_names_make_unique()
# Step 3: Set .raw after fixing
lnc_bin20.raw = lnc_bin20
# Step 4: Confirm fix
print("\nAfter fix:")
print("Any duplicates in .raw.var_names:", lnc_bin20.raw.var_names.has_duplicates)
Before fix: Any duplicates in adata.var_names: False Duplicated gene names: Index([], dtype='object', name='real_gene_name') After fix: Any duplicates in .raw.var_names: False
In [33]:
lnc_bin20.obs
Out[33]:
| total_counts | n_genes_by_counts | pct_counts_mt | leiden | orig.ident | x | y | Cancer | |
|---|---|---|---|---|---|---|---|---|
| 10222022175580 | 2 | 1 | 0.0 | 9 | sample | 2380 | 11100 | Melanoma |
| 10307921517360 | 1 | 1 | 0.0 | 9 | sample | 2400 | 6960 | Melanoma |
| 10307921520660 | 1 | 1 | 0.0 | 9 | sample | 2400 | 10260 | Melanoma |
| 10307921521540 | 2 | 1 | 0.0 | 9 | sample | 2400 | 11140 | Melanoma |
| 10307921521740 | 1 | 1 | 0.0 | 9 | sample | 2400 | 11340 | Melanoma |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 94832877901060 | 4 | 3 | 0.0 | 8 | sample | 22080 | 5380 | GBM |
| 94832877901080 | 1 | 1 | 0.0 | 9 | sample | 22080 | 5400 | GBM |
| 94832877901100 | 5 | 3 | 0.0 | 8 | sample | 22080 | 5420 | GBM |
| 94832877901120 | 4 | 2 | 0.0 | 9 | sample | 22080 | 5440 | GBM |
| 94832877901140 | 1 | 1 | 0.0 | 9 | sample | 22080 | 5460 | GBM |
512623 rows × 8 columns
In [35]:
adata_mel = lnc_bin20[lnc_bin20.obs['Cancer'] == 'Melanoma'].copy()
adata_mel
Out[35]:
AnnData object with n_obs × n_vars = 194752 × 34559
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [36]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_mel)
# Logarithmize the data
sc.pp.log1p(adata_mel)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_mel.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
adata_mel.obs["total_counts"][adata_mel.obs["total_counts"] < 10000],
kde=False,
bins=40,
ax=axs[1],
)
sns.histplot(adata_mel.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
adata_mel.obs["n_genes_by_counts"][adata_mel.obs["n_genes_by_counts"] < 4000],
kde=False,
bins=60,
ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
warn(UserWarning('Some cells have zero counts'))
Out[36]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
In [37]:
sc.pp.filter_cells(adata_mel, max_counts=200)
adata_mel = adata_mel[adata_mel.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_mel.n_obs}")
sc.pp.filter_genes(adata_mel, min_cells=10)
adata_mel
#cells after MT filter: 194554
Out[37]:
AnnData object with n_obs × n_vars = 194554 × 23230
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [38]:
sc.tl.pca(adata_mel, use_highly_variable=False)
sc.pp.neighbors(adata_mel)
sc.tl.umap(adata_mel)
In [39]:
sc.tl.leiden(adata_mel, key_added="leiden_res_0.0001", resolution=0.0001)
sc.pl.umap(adata_mel, color=["leiden_res_0.0001"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [40]:
sc.pl.spatial(adata_mel, color=["leiden_res_0.0001"], spot_size=50)
In [41]:
sc.tl.leiden(adata_mel, key_added="leiden_res_0.001", resolution=0.001)
sc.pl.umap(adata_mel, color=["leiden_res_0.001"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [42]:
sc.pl.spatial(adata_mel, color=["leiden_res_0.001"], spot_size=50)
In [47]:
mel_marker_genes = {
#"CRC": ["CEACAM5", "CDX2", "EPCAM", "KRT20", "MUC2", "CLDN1"],
"melanoma": ["MITF", "TYR", "DCT", "MLANA", "PMEL", "SOX10"]
}
# Make unique if duplicates exist
sc.pl.dotplot(
adata_mel,
var_names=mel_marker_genes,
groupby='leiden_res_0.001',
standard_scale='var',
dot_max=0.5,
color_map='Reds',
show=True, use_raw=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [46]:
# Identify marker genes for each cluster
sc.tl.rank_genes_groups(adata_mel, groupby="leiden_res_0.001", method="t-test")#, use_raw=False) # or method="wilcoxon"
sc.pl.rank_genes_groups_dotplot(adata_mel, n_genes=10, groupby="leiden_res_0.001")#, use_raw=False)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [48]:
import pickle
# Save
with open("lnc_Melanpma.pkl", "wb") as f:
pickle.dump(adata_mel, f)
In [77]:
import pickle
# Save
with open("lnc_Melanpma.pkl", "rb") as f:
adata_mel20=pickle.load(f)
In [79]:
mel_marker_genes = {
#"CRC": ["CEACAM5", "CDX2", "EPCAM", "KRT20", "MUC2", "CLDN1"],
"melanoma": ["MITF", "TYR", "DCT", "MLANA", "PMEL", "SOX10","KRT5","PRAME","S100B"]
}
# Make unique if duplicates exist
sc.pl.dotplot(
adata_mel20,
var_names=mel_marker_genes,
groupby='leiden_res_0.001',
standard_scale='var',
color_map='Reds',
show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
bin50¶
In [2]:
import pickle
# Load the object from the pickle file
with open("/QRISdata/Q1851/STOmics_SAW_Outputs/lncRNA/visualization/lncRNA_bin50_annotated_cancer.pkl", "rb") as f:
lnc_bin50 = pickle.load(f)
lnc_bin50.var['ENS']=lnc_bin50.var.index
lnc_bin50.var.index=lnc_bin50.var['real_gene_name']
lnc_bin50.raw.var['ENS']=lnc_bin50.raw.var.index
lnc_bin50.raw.var.index=lnc_bin50.raw.var['real_gene_name']
lnc_bin50.var_names = lnc_bin50.var['real_gene_name']
lnc_bin50
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:887: UserWarning:
AnnData expects .var.index to contain strings, but got values like:
['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']
Inferred to be: categorical
names = self._prep_dim_index(names, "var")
Out[2]:
AnnData object with n_obs × n_vars = 83575 × 206923
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [3]:
lnc_bin50_all=lnc_bin50
lnc_bin50 = lnc_bin50[:, ~lnc_bin50.var_names.str.startswith("cuTAR")].copy()
lnc_bin50
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Out[3]:
AnnData object with n_obs × n_vars = 83575 × 34559
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [4]:
lnc_bin50
Out[4]:
AnnData object with n_obs × n_vars = 83575 × 34559
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [ ]:
sc.pl.spatial(lnc_bin50,color=['cuTAR100799','cuTAR48562','cuTAR81095','cuTAR167236'], spot_size=50, vmax=2, cmap="Reds")
In [60]:
sc.pl.spatial(lnc_bin50,color=['cuTAR141997','cuTAR19863','cuTAR86784','cuTAR30875'], spot_size=70, vmax=2, cmap="Reds")
In [69]:
# pan-cancer
sc.pl.spatial(lnc_bin50,color=['cuTAR262781','cuTAR17199','cuTAR162708','cuTAR87324'], spot_size=80, vmax=2, cmap="Reds")
In [11]:
sc.pl.spatial(lnc_bin50,color=['cuTAR30875','cuTAR48563'], spot_size=80, vmax=2, cmap="Reds")
In [17]:
sc.pl.spatial(lnc_bin50,color=['cuTAR30875'], spot_size=80, vmax=1, cmap="inferno")
In [ ]:
In [75]:
import numpy as np
import pandas as pd
# Step 1: Ensure gene names are strings and unique
lnc_bin50.var_names = pd.Index(lnc_bin50.var_names.astype(str))
lnc_bin50.var_names_make_unique()
# Step 2: Get gene names and filter for cuTAR
genes = lnc_bin50.var_names
cutar_mask = genes.str.startswith("cuTAR")
cutar_genes = genes[cutar_mask]
# Step 3: Subset expression matrix for those genes
cutar_counts = lnc_bin50[:, cutar_genes].X
# Step 4: Count number of cells each gene is expressed in
if hasattr(cutar_counts, 'getnnz'): # sparse
cell_counts = cutar_counts.getnnz(axis=0)
else:
cell_counts = np.sum(cutar_counts > 0, axis=0)
# Step 5: Count genes expressed in at least 3 cells
num_cutars_expressed3 = np.sum(cell_counts >= 3)
num_cutars_expressed5 = np.sum(cell_counts >= 5)
num_cutars_expressed10 = np.sum(cell_counts >= 10)
num_cutars_expressed100 = np.sum(cell_counts >= 100)
print("cuTAR genes expressed in ≥3 cells:", num_cutars_expressed3)
print("cuTAR genes expressed in ≥5 cells:", num_cutars_expressed5)
print("cuTAR genes expressed in ≥10 cells:", num_cutars_expressed10)
print("cuTAR genes expressed in ≥100 cells:", num_cutars_expressed100)
cuTAR genes expressed in ≥3 cells: 116860 cuTAR genes expressed in ≥5 cells: 88045 cuTAR genes expressed in ≥10 cells: 52417 cuTAR genes expressed in ≥100 cells: 1691
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
In [9]:
# Step 6: Save the list of cuTARs expressed in ≥3 cells
cutar_genes_expressed3 = cutar_genes[cell_counts >= 3]
cutar_genes_expressed3.to_series().to_csv("cuTAR_expressed_in_3_or_more_cells.txt", index=False, header=False)
In [ ]:
In [37]:
import pandas as pd
import numpy as np
# List of target cuTAR genes
target_genes = ['cuTAR141997','cuTAR288960','cuTAR94762','cuTAR280980','cuTAR201736', 'cuTAR42228',
'cuTAR19863', 'cuTAR86784', 'cuTAR78711']
# Ensure gene names are strings and unique
lnc_bin50.var_names = pd.Index(lnc_bin50.var_names.astype(str))
lnc_bin50.var_names_make_unique()
# Filter to existing genes only
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]
if not existing_genes:
print("None of the target genes found in the dataset.")
else:
# Subset expression matrix for the selected genes
expr = lnc_bin50[:, existing_genes].X
# Convert to dense if sparse
if hasattr(expr, 'toarray'):
expr = expr.toarray()
# Create DataFrame: rows = cells, columns = genes
expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)
# Add Cancer info
expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values
# Convert expression to binary (expressed > 0)
binary_expr = expr_df[existing_genes] > 0
binary_expr['Cancer'] = expr_df['Cancer']
# Count number of expressing cells per gene per Cancer group
summary = binary_expr.groupby('Cancer')[existing_genes].sum()
print("Number of cells expressing each gene per Cancer group:")
print(summary)
import seaborn as sns
import matplotlib.pyplot as plt
# Reset index so 'Cancer' becomes a column for plotting
summary_reset = summary.reset_index()
# Convert to long format for seaborn
plot_df = summary_reset.melt(id_vars="Cancer",
var_name="Gene",
value_name="Cell Count")
# Plot
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_df, x="Cancer", y="Cell Count", hue="Gene")
plt.title("Number of Cells Expressing Each cuTAR Gene per Cancer Sample")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="cuTAR Gene")
plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
Number of cells expressing each gene per Cancer group:
cuTAR141997 cuTAR288960 cuTAR94762 cuTAR280980 \
Cancer
Colorectal_cancer 16 11 4 0
GBM 38 19 5 1
Melanoma 56 18 10 0
cuTAR201736 cuTAR42228 cuTAR19863 cuTAR86784 cuTAR78711
Cancer
Colorectal_cancer 11 2 2 2 0
GBM 32 2 6 11 2
Melanoma 25 10 3 7 0
In [49]:
target_genes = ['cuTAR141997','cuTAR94762', 'cuTAR42228','cuTAR59116','cuTAR291293','cuTAR30875','cuTAR23054','cuTAR293520']
# Filter for genes that exist in var_names
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]
if not existing_genes:
print("None of the target cuTAR genes found.")
else:
# Subset expression
expr = lnc_bin50[:, existing_genes].X
# Convert sparse matrix to dense if needed
if hasattr(expr, 'toarray'):
expr = expr.toarray()
# Create DataFrame (cells x genes)
expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)
# Add Cancer group info
expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values
# Group by Cancer and compute total expression (sum)
total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()
# Melt for plotting
plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')
# Plot total expression
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_df, x='Cancer', y='Total Expression', hue='Gene')
plt.title("Total Expression of cuTAR Genes per Cancer Group")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="Gene")
plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
In [59]:
target_genes = ['cuTAR141997','cuTAR94762', 'cuTAR42228','cuTAR59116','cuTAR291293','cuTAR30875','cuTAR293520']
# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]
if not existing_genes:
print("None of the target cuTAR genes found.")
else:
# Get expression matrix
expr = lnc_bin50[:, existing_genes].X
# Convert to dense if sparse
if hasattr(expr, 'toarray'):
expr = expr.toarray()
# Create DataFrame: cells x genes
expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)
# Add Cancer group/sample info
expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values
# Group by Cancer and compute total expression
total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()
# Reshape for plotting
plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')
# Plot: genes on X-axis, grouped by Cancer
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')
plt.title("Total Expression of Melanoma specific cuTARs")
plt.tight_layout()
plt.legend(title='Sample')
plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# ---- target genes ----
target_genes = ['cuTAR141997','cuTAR94762', 'cuTAR42228','cuTAR59116','cuTAR291293','cuTAR30875','cuTAR293520']
# Ensure gene names are strings and unique
lnc_bin50.var_names = pd.Index(lnc_bin50.var_names.astype(str))
lnc_bin50.var_names_make_unique()
# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]
if not existing_genes:
print("None of the target cuTAR genes found.")
else:
# Get expression matrix
expr = lnc_bin50[:, existing_genes].X
# Convert to dense if sparse
if hasattr(expr, 'toarray'):
expr = expr.toarray()
# Create DataFrame: cells x genes
expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)
# Add Cancer group/sample info
expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values
# Group by Cancer and compute total expression
total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()
# Reshape for plotting
plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')
# ---- Plot and save as PDF ----
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')
plt.title("Total Expression of Melanoma specific cuTARs")
plt.tight_layout()
plt.legend(title='Sample')
# Save
output_pdf = "melanoma_cuTAR_expression_stomics.pdf"
plt.savefig(output_pdf, format="pdf")
plt.close()
print(f"Saved plot to {output_pdf}")
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
Saved plot to melanoma_cuTAR_expression_stomics.pdf
In [74]:
# CRC cuTARs
target_genes = ['cuTAR86784','cuTAR328192','cuTAR88740','cuTAR323949','cuTAR283862']
# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]
if not existing_genes:
print("None of the target cuTAR genes found.")
else:
# Get expression matrix
expr = lnc_bin50[:, existing_genes].X
# Convert to dense if sparse
if hasattr(expr, 'toarray'):
expr = expr.toarray()
# Create DataFrame: cells x genes
expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)
# Add Cancer group/sample info
expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values
# Group by Cancer and compute total expression
total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()
# Reshape for plotting
plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')
# Plot: genes on X-axis, grouped by Cancer
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')
plt.title("Total Expression of CRC cuTARs")
plt.tight_layout()
plt.legend(title='Sample')
plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
In [58]:
# pan cancer cuTARs
target_genes = ['cuTAR19863',
'cuTAR291690',
'cuTAR262781',
'cuTAR137818',
'cuTAR87324',
#'cuTAR170802',
'cuTAR51249',
'cuTAR129713',
'cuTAR100897',
'cuTAR171799',
'cuTAR301342',
'cuTAR162708',
'cuTAR162807',
'cuTAR186121']
#target_genes = ['cuTAR78711','cuTAR276859','cuTAR290541','cuTAR270431','cuTAR207309','cuTAR6939'] # GBM specific
# Filter to only existing genes
existing_genes = [g for g in target_genes if g in lnc_bin50.var_names]
if not existing_genes:
print("None of the target cuTAR genes found.")
else:
# Get expression matrix
expr = lnc_bin50[:, existing_genes].X
# Convert to dense if sparse
if hasattr(expr, 'toarray'):
expr = expr.toarray()
# Create DataFrame: cells x genes
expr_df = pd.DataFrame(expr, columns=existing_genes, index=lnc_bin50.obs_names)
# Add Cancer group/sample info
expr_df['Cancer'] = lnc_bin50.obs['Cancer'].values
# Group by Cancer and compute total expression
total_expr = expr_df.groupby('Cancer')[existing_genes].sum().reset_index()
# Reshape for plotting
plot_df = total_expr.melt(id_vars='Cancer', var_name='Gene', value_name='Total Expression')
# Plot: genes on X-axis, grouped by Cancer
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_df, x='Gene', y='Total Expression', hue='Cancer')
plt.title("Total Expression of Pan-cancer cuTARs")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title='Sample')
plt.show()
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
In [ ]:
In [ ]:
In [ ]:
In [4]:
import anndata
import pandas as pd
print("Original var_names:", lnc_bin50.var_names[:5]) # Preview first 5
print("Type of var_names:", type(lnc_bin50.var_names))
print("Is categorical:", pd.api.types.is_categorical_dtype(lnc_bin50.var_names))
# Option 1: Convert var_names to strings and make unique
# This avoids the categorical issue by using a simple string index
lnc_bin50.var_names = lnc_bin50.var_names.astype(str) # Convert to string
lnc_bin50.var_names_make_unique() # Now this should work
# Verify the result
print("Updated var_names:", lnc_bin50.var_names[:5]) # Check first 5
print("Any duplicates:", lnc_bin50.var_names.has_duplicates) # Should be False
Original var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name') Type of var_names: <class 'pandas.core.indexes.base.Index'> Is categorical: False Updated var_names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name') Any duplicates: False
In [5]:
# Step 2: Check and fix var_names duplicates BEFORE setting .raw
print("Before fix:")
print("Any duplicates in adata.var_names:", lnc_bin50.var_names.has_duplicates)
dupes = lnc_bin50.var_names[lnc_bin50.var_names.duplicated()]
print("Duplicated gene names:")
print(dupes)
# Fix duplicates
lnc_bin50.var_names_make_unique()
# Step 3: Set .raw after fixing
lnc_bin50.raw = lnc_bin50
# Step 4: Confirm fix
print("\nAfter fix:")
print("Any duplicates in .raw.var_names:", lnc_bin50.raw.var_names.has_duplicates)
Before fix: Any duplicates in adata.var_names: False Duplicated gene names: Index([], dtype='object', name='real_gene_name') After fix: Any duplicates in .raw.var_names: False
In [4]:
adata_mel = lnc_bin50[lnc_bin50.obs['Cancer'] == 'Melanoma'].copy()
adata_mel
Out[4]:
AnnData object with n_obs × n_vars = 31955 × 34559
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [23]:
sc.pl.spatial(adata_mel,color=['cuTAR94762','cuTAR288960'], spot_size=65, vmax=1, cmap="inferno", show=False)
Out[23]:
[<Axes: title={'center': 'cuTAR94762'}, xlabel='spatial1', ylabel='spatial2'>,
<Axes: title={'center': 'cuTAR288960'}, xlabel='spatial1', ylabel='spatial2'>]
In [15]:
import matplotlib.pyplot as plt
with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
sc.pl.spatial(adata_mel,color=['cuTAR94762'], spot_size=65, vmax=1, cmap="inferno", show=False)
plt.savefig("/scratch/project/stseq/Prakrithi/STOMICS/cuTAR94762_mel.pdf", bbox_inches="tight")
In [11]:
sc.pl.spatial(lnc_bin50_all,color=['cuTAR78711','cuTAR276859'], spot_size=55, vmax=1, cmap="Reds", show=False)
Out[11]:
[<Axes: title={'center': 'cuTAR78711'}, xlabel='spatial1', ylabel='spatial2'>,
<Axes: title={'center': 'cuTAR276859'}, xlabel='spatial1', ylabel='spatial2'>]
In [ ]:
In [31]:
import matplotlib.pyplot as plt
with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
sc.pl.spatial(adata_mel,color=['cuTAR30875'], spot_size=65, vmax=1, cmap="inferno", show=False)
plt.savefig("/scratch/project/stseq/Prakrithi/STOMICS/cuTAR30875_mel.pdf", bbox_inches="tight")
In [7]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_mel)
# Logarithmize the data
sc.pp.log1p(adata_mel)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_mel.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
adata_mel.obs["total_counts"][adata_mel.obs["total_counts"] < 10000],
kde=False,
bins=40,
ax=axs[1],
)
sns.histplot(adata_mel.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
adata_mel.obs["n_genes_by_counts"][adata_mel.obs["n_genes_by_counts"] < 4000],
kde=False,
bins=60,
ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
warn(UserWarning('Some cells have zero counts'))
Out[7]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
In [8]:
sc.pp.filter_cells(adata_mel, max_counts=800)
adata_mel = adata_mel[adata_mel.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_mel.n_obs}")
sc.pp.filter_genes(adata_mel, min_cells=10)
adata_mel
#cells after MT filter: 31949
Out[8]:
AnnData object with n_obs × n_vars = 31949 × 23229
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [9]:
sc.tl.pca(adata_mel, use_highly_variable=False)
sc.pp.neighbors(adata_mel)
sc.tl.umap(adata_mel)
2025-06-03 11:45:11.865623: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2025-06-03 11:45:13.703442: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2025-06-03 11:45:14.315555: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory 2025-06-03 11:45:14.315596: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. 2025-06-03 11:45:20.703254: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory 2025-06-03 11:45:20.703475: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory 2025-06-03 11:45:20.703483: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
In [18]:
sc.tl.leiden(adata_mel, key_added="leiden_res_0.2", resolution=0.2)
sc.pl.umap(adata_mel, color=["leiden_res_0.2"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [19]:
sc.pl.spatial(adata_mel, color=["leiden_res_0.1","leiden_res_0.2","leiden_res_0.3","leiden_res_0.5"], spot_size=50)
In [61]:
sc.pl.spatial(adata_mel, spot_size=50, color='total_counts')
In [76]:
mel_marker_genes = {
#"CRC": ["CEACAM5", "CDX2", "EPCAM", "KRT20", "MUC2", "CLDN1"],
"melanoma": ["MITF", "TYR", "DCT", "MLANA", "PMEL", "SOX10","KRT5","PRAME","S100B"]
}
# Make unique if duplicates exist
sc.pl.dotplot(
adata_mel,
var_names=mel_marker_genes,
groupby='leiden_res_0.5',
standard_scale='var',
color_map='Reds',
show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [ ]:
In [30]:
adata_mel_all = lnc_bin50_all[lnc_bin50_all.obs['Cancer'] == 'Melanoma'].copy()
adata_mel_all
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Out[30]:
AnnData object with n_obs × n_vars = 31955 × 206923
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [38]:
adata_mel_all.obs['leiden_res_0.5']=adata_mel.obs['leiden_res_0.5']
adata_mel_all.obs
Out[38]:
| total_counts | n_genes_by_counts | pct_counts_mt | leiden | orig.ident | x | y | Cancer | leiden_res_0.5 | |
|---|---|---|---|---|---|---|---|---|---|
| 10093173156700 | 2 | 1 | 0.0 | 8 | sample | 2350 | 11100 | Melanoma | 1 |
| 10307921515800 | 1 | 1 | 0.0 | 8 | sample | 2400 | 5400 | Melanoma | 1 |
| 10307921516300 | 1 | 1 | 0.0 | 8 | sample | 2400 | 5900 | Melanoma | 1 |
| 10307921516600 | 3 | 1 | 0.0 | 8 | sample | 2400 | 6200 | Melanoma | 1 |
| 10307921517350 | 2 | 2 | 0.0 | 8 | sample | 2400 | 6950 | Melanoma | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43164421343700 | 22 | 20 | 0.0 | 6 | sample | 10050 | 18900 | Melanoma | 1 |
| 43164421343750 | 86 | 59 | 0.0 | 5 | sample | 10050 | 18950 | Melanoma | 21 |
| 43164421343800 | 8 | 8 | 0.0 | 8 | sample | 10050 | 19000 | Melanoma | 1 |
| 43379169708500 | 25 | 19 | 0.0 | 6 | sample | 10100 | 18900 | Melanoma | 18 |
| 43379169708550 | 27 | 24 | 0.0 | 6 | sample | 10100 | 18950 | Melanoma | 7 |
31955 rows × 9 columns
In [ ]:
In [33]:
# Step 0: Make gene names unique
adata_mel_all.var_names_make_unique()
# Step 1: Filter genes starting with "cuTAR"
cutar_genes = [gene for gene in adata_mel_all.var_names if gene.startswith("cuTAR")]
# Step 2: Subset AnnData for only cuTAR genes
adata_cutar = adata_mel_all[:, cutar_genes]
In [ ]:
In [41]:
import scanpy as sc
import pandas as pd
# Assume adata_mel_all is your AnnData object
# Step 1: Check cluster labels and gene names
print("Cluster labels:", adata_mel_all.obs['leiden_res_0.5'].value_counts())
print("First few gene names:", adata_mel_all.var_names[:5])
# Step 2: Preprocess data
# Check if data is raw counts
print("Sample of raw data:", adata_mel_all.X[:5, :5].toarray()) # Preview first 5 cells, 5 genes
# Normalize total counts (e.g., to 10,000 per cell) and logarithmize
sc.pp.normalize_total(adata_mel_all, target_sum=1e4)
sc.pp.log1p(adata_mel_all) # Log-transform (log(1 + x))
print("Sample of normalized, log-transformed data:", adata_mel_all.X[:5, :5].toarray())
# Step 3: Filter genes starting with "cuTAR"
cuTAR_genes = [gene for gene in adata_mel_all.var_names if gene.startswith('cuTAR')]
print(f"Number of cuTAR genes: {len(cuTAR_genes)}")
print("First few cuTAR genes:", cuTAR_genes[:5])
# Step 4: Subset AnnData to only cuTAR genes
adata_cuTAR = adata_mel_all[:, cuTAR_genes].copy()
# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.5', groups=['0'], reference='rest', method='wilcoxon')
# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='0')
# Filter for highly expressed genes in cluster 0 (positive log fold change, significant p-value)
highly_de = de_results[
(de_results['logfoldchanges'] > 0) & # Upregulated in cluster 0
(de_results['pvals_adj'] < 0.05) # Adjusted p-value < 0.05 for significance
].sort_values('logfoldchanges', ascending=False) # Sort by log fold change (highest first)
# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 0:")
print(highly_de[['names', 'logfoldchanges', 'pvals_adj']])
# Optional: Save results to a CSV file
#highly_de.to_csv('de_cuTAR_cluster0.csv', index=False)
Cluster labels: 0 4836
1 4248
2 1526
3 1289
4 1278
5 1227
6 1193
7 1159
8 1113
9 1065
10 1017
11 984
12 943
13 878
14 876
15 874
16 859
17 854
18 836
19 816
20 808
21 726
22 714
23 665
24 518
25 412
26 175
27 60
Name: leiden_res_0.5, dtype: int64
First few gene names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Sample of raw data: [[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]]
Sample of normalized, log-transformed data: [[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]]
Number of cuTAR genes: 172364
First few cuTAR genes: ['cuTAR1', 'cuTAR10', 'cuTAR100', 'cuTAR1000', 'cuTAR10000']
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 0:
names logfoldchanges pvals_adj
0 cuTAR48563 5.034541 0.000000e+00
11 AC010967.1 3.745001 2.496113e-24
115 TYR 3.676590 1.014090e-02
1 RBFOX1 3.612103 1.175379e-148
25 PCSK2 3.520043 3.017661e-10
.. ... ... ...
69 cuTAR127 0.656923 1.623822e-04
27 cuTAR167235 0.615259 8.369745e-10
113 cuTAR100799 0.559353 6.501664e-03
56 cuTAR162694 0.546771 1.473693e-05
33 cuTAR170802 0.531583 5.421090e-08
[156 rows x 3 columns]
In [42]:
highly_de
Out[42]:
| names | scores | logfoldchanges | pvals | pvals_adj | |
|---|---|---|---|---|---|
| 0 | cuTAR48563 | 43.581669 | 5.034541 | 0.000000e+00 | 0.000000e+00 |
| 11 | AC010967.1 | 11.087194 | 3.745001 | 1.447561e-28 | 2.496113e-24 |
| 115 | TYR | 4.537781 | 3.676590 | 5.684936e-06 | 1.014090e-02 |
| 1 | RBFOX1 | 26.407085 | 3.612103 | 1.136055e-153 | 1.175379e-148 |
| 25 | PCSK2 | 7.567946 | 3.520043 | 3.791710e-14 | 3.017661e-10 |
| ... | ... | ... | ... | ... | ... |
| 69 | cuTAR127 | 5.434558 | 0.656923 | 5.493227e-08 | 1.623822e-04 |
| 27 | cuTAR167235 | 7.424442 | 0.615259 | 1.132561e-13 | 8.369745e-10 |
| 113 | cuTAR100799 | 4.634272 | 0.559353 | 3.581959e-06 | 6.501664e-03 |
| 56 | cuTAR162694 | 5.881750 | 0.546771 | 4.059505e-09 | 1.473693e-05 |
| 33 | cuTAR170802 | 6.823134 | 0.531583 | 8.907519e-12 | 5.421090e-08 |
156 rows × 5 columns
In [43]:
import scanpy as sc
import pandas as pd
# Assume adata_mel_all is your AnnData object
# Step 1: Check cluster labels and gene names
print("Cluster labels:", adata_mel_all.obs['leiden_res_0.5'].value_counts())
print("First few gene names:", adata_mel_all.var_names[:5])
# Step 2: Preprocess data
# Check if data is raw counts
print("Sample of raw data:", adata_mel_all.X[:5, :5].toarray())
# Normalize total counts and logarithmize
sc.pp.normalize_total(adata_mel_all, target_sum=1e4)
sc.pp.log1p(adata_mel_all)
print("Sample of normalized, log-transformed data:", adata_mel_all.X[:5, :5].toarray())
# Step 3: Filter genes starting with "cuTAR"
cuTAR_genes = [gene for gene in adata_mel_all.var_names if gene.startswith('cuTAR')]
print(f"Number of cuTAR genes: {len(cuTAR_genes)}")
print("First few cuTAR genes:", cuTAR_genes[:5])
# Step 4: Subset AnnData to only cuTAR genes
adata_cuTAR = adata_mel_all[:, cuTAR_genes].copy()
# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.5', groups=['0'], reference='rest', method='wilcoxon')
# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='0')
# Filter for highly expressed genes in cluster 0 and only cuTAR genes
highly_de_cuTAR = de_results[
(de_results['logfoldchanges'] > 0) & # Upregulated in cluster 0
(de_results['pvals_adj'] < 0.05) & # Adjusted p-value < 0.05 for significance
(de_results['names'].str.startswith('cuTAR')) # Only cuTAR genes
].sort_values('logfoldchanges', ascending=False) # Sort by log fold change (highest first)
# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 0:")
print(highly_de_cuTAR[['names', 'logfoldchanges', 'pvals_adj']])
# Optional: Save results to a CSV file
#
Cluster labels: 0 4836
1 4248
2 1526
3 1289
4 1278
5 1227
6 1193
7 1159
8 1113
9 1065
10 1017
11 984
12 943
13 878
14 876
15 874
16 859
17 854
18 836
19 816
20 808
21 726
22 714
23 665
24 518
25 412
26 175
27 60
Name: leiden_res_0.5, dtype: int64
First few gene names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name')
Sample of raw data: [[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]]
WARNING: adata.X seems to be already log-transformed.
Sample of normalized, log-transformed data: [[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]]
Number of cuTAR genes: 172364
First few cuTAR genes: ['cuTAR1', 'cuTAR10', 'cuTAR100', 'cuTAR1000', 'cuTAR10000']
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 0:
names logfoldchanges pvals_adj
0 cuTAR48563 5.034541 0.000000e+00
24 cuTAR48562 1.592207 2.117516e-10
50 cuTAR141754 1.585270 5.831195e-06
62 cuTAR141755 1.205844 5.697473e-05
4 cuTAR167240 0.919206 3.277730e-40
8 cuTAR81095 0.915182 1.242874e-30
3 cuTAR64 0.891957 3.858411e-46
17 cuTAR167236 0.833358 3.023247e-15
14 cuTAR39 0.764167 1.916128e-15
21 cuTAR167239 0.746357 2.033063e-11
15 cuTAR162573 0.692954 2.232043e-15
12 cuTAR63 0.686892 1.104622e-20
69 cuTAR127 0.656923 1.623822e-04
27 cuTAR167235 0.615259 8.369745e-10
113 cuTAR100799 0.559353 6.501664e-03
56 cuTAR162694 0.546771 1.473693e-05
33 cuTAR170802 0.531583 5.421090e-08
In [44]:
highly_de_cuTAR.to_csv('de_cuTAR_cluster0_bin50_mel.csv', index=False)
In [45]:
highly_de_cuTAR
Out[45]:
| names | scores | logfoldchanges | pvals | pvals_adj | |
|---|---|---|---|---|---|
| 0 | cuTAR48563 | 43.581669 | 5.034541 | 0.000000e+00 | 0.000000e+00 |
| 24 | cuTAR48562 | 7.618906 | 1.592207 | 2.558338e-14 | 2.117516e-10 |
| 50 | cuTAR141754 | 6.051256 | 1.585270 | 1.437206e-09 | 5.831195e-06 |
| 62 | cuTAR141755 | 5.636575 | 1.205844 | 1.734659e-08 | 5.697473e-05 |
| 4 | cuTAR167240 | 14.048035 | 0.919206 | 7.920169e-45 | 3.277730e-40 |
| 8 | cuTAR81095 | 12.341619 | 0.915182 | 5.405811e-35 | 1.242874e-30 |
| 3 | cuTAR64 | 14.998953 | 0.891957 | 7.458642e-51 | 3.858411e-46 |
| 17 | cuTAR167236 | 8.983207 | 0.833358 | 2.629889e-19 | 3.023247e-15 |
| 14 | cuTAR39 | 9.053150 | 0.764167 | 1.389016e-19 | 1.916128e-15 |
| 21 | cuTAR167239 | 7.931706 | 0.746357 | 2.161548e-15 | 2.033063e-11 |
| 15 | cuTAR162573 | 9.029417 | 0.692954 | 1.725892e-19 | 2.232043e-15 |
| 12 | cuTAR63 | 10.301462 | 0.686892 | 6.939823e-25 | 1.104622e-20 |
| 69 | cuTAR127 | 5.434558 | 0.656923 | 5.493227e-08 | 1.623822e-04 |
| 27 | cuTAR167235 | 7.424442 | 0.615259 | 1.132561e-13 | 8.369745e-10 |
| 113 | cuTAR100799 | 4.634272 | 0.559353 | 3.581959e-06 | 6.501664e-03 |
| 56 | cuTAR162694 | 5.881750 | 0.546771 | 4.059505e-09 | 1.473693e-05 |
| 33 | cuTAR170802 | 6.823134 | 0.531583 | 8.907519e-12 | 5.421090e-08 |
In [73]:
sc.pl.spatial(adata_mel_all, color=['cuTAR48562','cuTAR141755'], cmap="inferno", spot_size=70, vmax=2) #reds spot 70, vmax=5
In [48]:
# Make unique if duplicates exist
sc.pl.dotplot(
adata_mel,
var_names=highly_de_cuTAR['names'],
groupby='leiden_res_0.5',
standard_scale='var',
dot_max=0.5,
color_map='Reds',
show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [82]:
genes_to_plot = [
# Innate immune sensors
"TLR1", "TLR2", "TLR3", "TLR4", "TLR5", "TLR7", "TLR8", "TLR9",
"NOD1", "NOD2", "RIGI", "MDA5", "STING1",
# Cytokines and receptors
"IL1B", "IL6", "IL10", "IL12A", "IL12B", "TNF", "IFNA1", "IFNB1", "IFNG",
"CXCL8", "CCL2", "CCL3", "CXCL10", "CCR5", "CXCR3",
# Antigen presentation
"HLA-A", "HLA-B", "HLA-C", "HLA-DRA", "HLA-DRB1", "CD74", "TAP1", "TAP2",
# T cell markers
"CD3D", "CD3E", "CD4", "CD8A", "CD8B", "IL2RA", "FOXP3", "CTLA4",
# B cell markers
"CD19", "CD20", "CD79A", "CD79B", "MS4A1",
# NK cell markers
"NCAM1", "KLRD1", "NKG7", "GNLY", "GZMB", "PRF1",
# Macrophage/monocyte markers
"CD14", "CD68", "CD163", "CSF1R", "ITGAM",
# Dendritic cell markers
"ITGAX", "CD80", "CD86", "CCR7",
# Inflammation and signaling
"STAT1", "STAT3", "IRF1", "IRF3", "IRF7", "NFKB1", "RELA"
]
# Ensure genes are in adata.var_names
genes_to_plot = [gene for gene in genes_to_plot if gene in adata_mel.var_names]
if not genes_to_plot:
raise ValueError("None of the specified genes are in adata.var_names")
sc.pl.dotplot(
adata_mel,
var_names=genes_to_plot,
groupby='leiden_res_0.5', # Assumes adata.obs['organ'] has 'salivary_gland', 'midgut'
standard_scale='var', # Scales expression to [0, 1] per gene
color_map='Reds', # Color map for mean expression
show=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [97]:
import pickle
# Save
with open("lnc_Melanoma_processed_bin50.pkl", "wb") as f:
pickle.dump(adata_mel, f)
with open("lnc_Melanoma_processed_bin50_with_cuTARs.pkl", "wb") as f:
pickle.dump(adata_mel_all, f)
In [ ]:
In [ ]:
In [ ]:
GBM¶
In [83]:
adata_GBM = lnc_bin50[lnc_bin50.obs['Cancer'] == 'GBM'].copy()
adata_GBM
Out[83]:
AnnData object with n_obs × n_vars = 30319 × 34559
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [84]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_GBM)
# Logarithmize the data
sc.pp.log1p(adata_GBM)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_GBM.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
adata_GBM.obs["total_counts"][adata_GBM.obs["total_counts"] < 10000],
kde=False,
bins=40,
ax=axs[1],
)
sns.histplot(adata_GBM.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
adata_GBM.obs["n_genes_by_counts"][adata_GBM.obs["n_genes_by_counts"] < 4000],
kde=False,
bins=60,
ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
warn(UserWarning('Some cells have zero counts'))
Out[84]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
In [85]:
sc.pp.filter_cells(adata_GBM, max_counts=500)
adata_GBM = adata_GBM[adata_GBM.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_GBM.n_obs}")
sc.pp.filter_genes(adata_GBM, min_cells=10)
adata_GBM
#cells after MT filter: 30306
Out[85]:
AnnData object with n_obs × n_vars = 30306 × 22930
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [86]:
sc.tl.pca(adata_GBM, use_highly_variable=False)
sc.pp.neighbors(adata_GBM)
sc.tl.umap(adata_GBM)
In [87]:
sc.tl.leiden(adata_GBM, key_added="leiden_res_0.2", resolution=0.2)
sc.pl.umap(adata_GBM, color=["leiden_res_0.2"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [94]:
sc.tl.leiden(adata_GBM, key_added="leiden_res_0.3", resolution=0.3)
sc.pl.umap(adata_GBM, color=["leiden_res_0.3"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [95]:
sc.pl.spatial(adata_GBM, color=["leiden_res_0.2","leiden_res_0.3","leiden_res_0.5"], spot_size=70)
#sc.pl.spatial(adata_GBM, color=["leiden_res_0.1","leiden_res_0.2","leiden_res_0.3","leiden_res_0.5"], spot_size=50)
In [96]:
gbm_marker_genes = [
# General GBM markers
"EGFR", "PDGFRA", "IDH1", "TP53", "NF1", "MGMT", "MDM2", "MDM4",
# Proliferation / stemness
"MKI67", "PROM1", "NES",
# Mesenchymal subtype / immune interaction
"CD44", "STAT3", "TNFAIP3", "CD274",
# Proneural subtype
"PDGFRA",
# Classical subtype
"EGFR", "NES", "BCAN",
# Neural subtype (less reliable in bulk)
"SLC12A5", "SYT1", "SNAP25",
# Microenvironment / immune
"ITGAM", "CX3CR1", "PTPRC", "CD163", "CSF1R", "HLA-DRA",
# Astrocytic / glial markers
"AQP4",
# Oligodendrocyte / OPC
"PDGFRA", "CSPG4", "MBP"
]
# Ensure genes are in adata.var_names
genes_to_plot = [gene for gene in genes_to_plot if gene in adata_GBM.var_names]
if not genes_to_plot:
raise ValueError("None of the specified genes are in adata.var_names")
sc.pl.dotplot(
adata_GBM,
var_names=gbm_marker_genes,
groupby='leiden_res_0.3',
standard_scale='var',
color_map='Reds',
show=True, use_raw=False
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
CRC¶
In [10]:
adata_CRC = lnc_bin50[lnc_bin50.obs['Cancer'] == 'Colorectal_cancer'].copy()
adata_CRC
Out[10]:
AnnData object with n_obs × n_vars = 21301 × 34559
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [11]:
# Normalizing to median total counts
sc.pp.normalize_total(adata_CRC)
# Logarithmize the data
sc.pp.log1p(adata_CRC)
fig, axs = plt.subplots(1, 4, figsize=(15, 4))
sns.histplot(adata_CRC.obs["total_counts"], kde=False, ax=axs[0])
sns.histplot(
adata_CRC.obs["total_counts"][adata_CRC.obs["total_counts"] < 10000],
kde=False,
bins=40,
ax=axs[1],
)
sns.histplot(adata_CRC.obs["n_genes_by_counts"], kde=False, bins=60, ax=axs[2])
sns.histplot(
adata_CRC.obs["n_genes_by_counts"][adata_CRC.obs["n_genes_by_counts"] < 4000],
kde=False,
bins=60,
ax=axs[3],
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:196: UserWarning: Some cells have zero counts
warn(UserWarning('Some cells have zero counts'))
Out[11]:
<Axes: xlabel='n_genes_by_counts', ylabel='Count'>
In [12]:
sc.pp.filter_cells(adata_CRC, max_counts=400)
adata_CRC = adata_CRC[adata_CRC.obs["pct_counts_mt"] < 20].copy()
print(f"#cells after MT filter: {adata_CRC.n_obs}")
sc.pp.filter_genes(adata_CRC, min_cells=10)
adata_CRC
#cells after MT filter: 21294
Out[12]:
AnnData object with n_obs × n_vars = 21294 × 18284
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'n_counts'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [13]:
sc.tl.pca(adata_CRC, use_highly_variable=False)
sc.pp.neighbors(adata_CRC)
sc.tl.umap(adata_CRC)
2025-06-04 11:58:57.395591: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2025-06-04 11:59:02.329185: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory 2025-06-04 11:59:02.329240: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. 2025-06-04 11:59:57.703533: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory 2025-06-04 11:59:57.703662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory 2025-06-04 11:59:57.703672: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
In [20]:
sc.tl.leiden(adata_CRC, key_added="leiden_res_0.3", resolution=0.3)
sc.pl.umap(adata_CRC, color=["leiden_res_0.3"],legend_loc="on data")
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [24]:
sc.pl.spatial(adata_CRC, color=["leiden_res_0.2","leiden_res_0.3"], spot_size=60) #,"leiden_res_0.5"
In [22]:
CRC_marker_genes = [
"APC", # Tumor suppressor, often mutated early
"TP53", # Tumor suppressor, frequently mutated
"KRAS", # Oncogene, common driver mutation
"BRAF", # Oncogene, V600E mutation often in MSI-high CRC
"PIK3CA", # Involved in PI3K-AKT signaling
"SMAD4", # TGF-beta signaling, tumor suppressor
"MLH1", # DNA mismatch repair gene
"MSH2", # Mismatch repair gene, Lynch syndrome
"MSH6", # Mismatch repair gene
"PMS2", # Mismatch repair gene
"CDX2", # Differentiation marker for intestinal epithelium
"CEACAM5", # CEA – used as a serum marker
"EPCAM", # Cell adhesion, involved in Lynch syndrome
"LGR5", # Stem cell marker in colorectal epithelium
"AXIN2", # Wnt signaling pathway component
"CTNNB1", # β-catenin, Wnt pathway
"MET", # Receptor tyrosine kinase, prognostic
"VEGFA", # Angiogenesis, target of bevacizumab
"EGFR", # Target of cetuximab/panitumumab
"ALDH1A1", # Cancer stem cell marker
"SLC6A6", # Overexpressed in CRC
"MMP7", # Matrix metalloproteinase, invasion marker
"TNFRSF10B", # Death receptor pathway
"CXCL1", # Inflammatory chemokine, pro-tumorigenic
"CD44", # Stem cell and EMT marker
]
genes_to_plot=CRC_marker_genes
# Ensure genes are in adata.var_names
genes_to_plot = [gene for gene in genes_to_plot if gene in adata_CRC.var_names]
if not genes_to_plot:
raise ValueError("None of the specified genes are in adata.var_names")
sc.pl.dotplot(
adata_CRC,
var_names=CRC_marker_genes,
groupby='leiden_res_0.3',
standard_scale='var',
color_map='Reds',
show=True, use_raw=False
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [45]:
sc.pl.dotplot(
adata_CRC,
var_names=CRC_marker_genes,
groupby='leiden_res_0.2',
standard_scale='var',
color_map='Reds',
show=True, use_raw=False
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [26]:
adata_CRC_all = lnc_bin50_all[lnc_bin50_all.obs['Cancer'] == 'Colorectal_cancer'].copy()
adata_CRC_all.obs['leiden_res_0.3']=adata_CRC.obs['leiden_res_0.3']
adata_CRC_all.obs
# Step 0: Make gene names unique
adata_CRC_all.var_names_make_unique()
# Step 1: Filter genes starting with "cuTAR"
cutar_genes = [gene for gene in adata_CRC_all.var_names if gene.startswith("cuTAR")]
# Step 2: Subset AnnData for only cuTAR genes
adata_cutar = adata_CRC_all[:, cutar_genes]
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
/home/s4716765/.local/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
In [40]:
adata_CRC_all
Out[40]:
AnnData object with n_obs × n_vars = 21301 × 206923
obs: 'total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden', 'orig.ident', 'x', 'y', 'Cancer', 'leiden_res_0.3'
var: 'real_gene_name', 'n_cells', 'n_counts', 'mean_umi', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'ENS'
uns: 'bin_size', 'bin_type', 'gene_exp_leiden', 'hvg', 'key_record', 'leiden_resolution', 'merged', 'neighbors', 'omics', 'pca_variance_ratio', 'rank_genes_groups', 'resolution', 'result_keys', 'sn', 'log1p'
obsm: 'X_pca', 'X_umap', 'spatial'
obsp: 'connectivities', 'distances'
In [42]:
import scanpy as sc
import pandas as pd
adata_CRC_all.obs['leiden_res_0.2']=adata_CRC.obs['leiden_res_0.2']
# Assume adata_mel_all is your AnnData object
# Step 1: Check cluster labels and gene names
print("Cluster labels:", adata_CRC_all.obs['leiden_res_0.2'].value_counts())
print("First few gene names:", adata_CRC_all.var_names[:5])
# Step 2: Preprocess data
# Check if data is raw counts
print("Sample of raw data:", adata_CRC_all.X[:5, :5].toarray())
# Normalize total counts and logarithmize
sc.pp.normalize_total(adata_CRC_all, target_sum=1e4)
sc.pp.log1p(adata_CRC_all)
print("Sample of normalized, log-transformed data:", adata_CRC_all.X[:5, :5].toarray())
# Step 3: Filter genes starting with "cuTAR"
cuTAR_genes = [gene for gene in adata_CRC_all.var_names if gene.startswith('cuTAR')]
print(f"Number of cuTAR genes: {len(cuTAR_genes)}")
print("First few cuTAR genes:", cuTAR_genes[:5])
# Step 4: Subset AnnData to only cuTAR genes
adata_cuTAR = adata_CRC_all[:, cuTAR_genes].copy()
Cluster labels: 0 17567 1 2796 2 931 Name: leiden_res_0.2, dtype: int64 First few gene names: Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='real_gene_name') Sample of raw data: [[0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.]] WARNING: adata.X seems to be already log-transformed. Sample of normalized, log-transformed data: [[0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.]] Number of cuTAR genes: 172364 First few cuTAR genes: ['cuTAR1', 'cuTAR10', 'cuTAR100', 'cuTAR1000', 'cuTAR10000']
In [32]:
# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.3', groups=['2'], reference='rest', method='wilcoxon')
# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='2')
# Filter for highly expressed genes in cluster 0 and only cuTAR genes
highly_de_cuTAR = de_results[
(de_results['logfoldchanges'] > 0) & # Upregulated in cluster 0
(de_results['pvals_adj'] < 0.05) & # Adjusted p-value < 0.05 for significance
(de_results['names'].str.startswith('cuTAR')) # Only cuTAR genes
].sort_values('logfoldchanges', ascending=False) # Sort by log fold change (highest first)
# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 2:")
print(highly_de_cuTAR[['names', 'logfoldchanges', 'pvals_adj']])
# Optional: Save results to a CSV file
#
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 2:
names logfoldchanges pvals_adj
4 cuTAR48563 1.608611 1.736617e-45
In [35]:
#highly_de_cuTAR.to_csv('de_cuTAR_cluster0_bin50_CRC.csv', index=False)
sc.pl.spatial(adata_CRC_all, color=['cuTAR48563'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5
# Make unique if duplicates exist
In [36]:
sc.pl.dotplot(
adata_CRC_all,
var_names=highly_de_cuTAR['names'],
groupby='leiden_res_0.3',
standard_scale='var',
color_map='Reds',
show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [43]:
# Step 5: Run differential expression analysis for cluster 0
sc.tl.rank_genes_groups(adata_cuTAR, groupby='leiden_res_0.2', groups=['0'], reference='rest', method='wilcoxon')
# Step 6: Extract DE results
de_results = sc.get.rank_genes_groups_df(adata_cuTAR, group='0')
# Filter for highly expressed genes in cluster 0 and only cuTAR genes
highly_de_cuTAR = de_results[
(de_results['logfoldchanges'] > 0) & # Upregulated in cluster 0
(de_results['pvals_adj'] < 0.05) & # Adjusted p-value < 0.05 for significance
(de_results['names'].str.startswith('cuTAR')) # Only cuTAR genes
].sort_values('logfoldchanges', ascending=False) # Sort by log fold change (highest first)
# Step 7: Display results
print("Highly differentially expressed cuTAR genes in cluster 2:")
print(highly_de_cuTAR[['names', 'logfoldchanges', 'pvals_adj']])
# Optional: Save results to a CSV file
#
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
Highly differentially expressed cuTAR genes in cluster 2:
names logfoldchanges pvals_adj
0 cuTAR48563 2.551227 1.066937e-164
6 cuTAR127 0.810377 1.961647e-07
16 cuTAR81095 0.642386 1.625657e-03
3 cuTAR64 0.494573 5.932891e-09
22 cuTAR167240 0.402884 1.595125e-02
In [44]:
sc.pl.dotplot(
adata_CRC_all,
var_names=highly_de_cuTAR['names'],
groupby='leiden_res_0.2',
standard_scale='var',
color_map='Reds',
show=True, use_raw=False, swap_axes=True
)
/home/s4716765/.local/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
In [46]:
sc.pl.spatial(adata_CRC_all, color=['cuTAR48563','cuTAR127','cuTAR167240'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5
IBD¶
In [98]:
import pickle as pkl
Path_raw_data_2 = "/QRISdata/Q1851/STOmics_SAW_Outputs/IBD_new/"
with open(Path_raw_data_2+"A04627E2_bin20.pkl", "rb") as f:
IBD = pkl.load(f)
IBD
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[98], line 3 1 import pickle as pkl 2 Path_raw_data_2 = "/QRISdata/Q1851/STOmics_SAW_Outputs/IBD_new/" ----> 3 with open(Path_raw_data_2+"A04627E2_bin20.pkl", "rb") as f: 4 IBD = pkl.load(f) 5 IBD File ~/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py:284, in _modified_open(file, *args, **kwargs) 277 if file in {0, 1, 2}: 278 raise ValueError( 279 f"IPython won't let you open fd={file} by default " 280 "as it is likely to crash IPython. If you know what you are doing, " 281 "you can use builtins' open." 282 ) --> 284 return io_open(file, *args, **kwargs) FileNotFoundError: [Errno 2] No such file or directory: '/QRISdata/Q1851/STOmics_SAW_Outputs/IBD_new/A04627E2_bin20.pkl'
In [47]:
sc.pl.spatial(adata_CRC_all, color=['cuTAR176226'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File ~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key, method, tolerance) 3801 try: -> 3802 return self._engine.get_loc(casted_key) 3803 except KeyError as err: File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:138, in pandas._libs.index.IndexEngine.get_loc() File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:162, in pandas._libs.index.IndexEngine.get_loc() File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:203, in pandas._libs.index.IndexEngine._get_loc_duplicates() File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:211, in pandas._libs.index.IndexEngine._maybe_get_bool_indexer() File ~/.local/lib/python3.8/site-packages/pandas/_libs/index.pyx:107, in pandas._libs.index._unpack_bool_indexer() KeyError: 'cuTAR176226' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) Cell In[47], line 1 ----> 1 sc.pl.spatial(adata_CRC_all, color=['cuTAR176226'], cmap="inferno", spot_size=70) #reds spot 70, vmax=5 File ~/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:1026, in spatial(adata, basis, img, img_key, library_id, crop_coord, alpha_img, bw, size, scale_factor, spot_size, na_color, show, return_fig, save, **kwargs) 1023 cmap_img = None 1024 circle_radius = size * scale_factor * spot_size * 0.5 -> 1026 axs = embedding( 1027 adata, 1028 basis=basis, 1029 scale_factor=scale_factor, 1030 size=circle_radius, 1031 na_color=na_color, 1032 show=False, 1033 save=False, 1034 **kwargs, 1035 ) 1036 if not isinstance(axs, list): 1037 axs = [axs] File ~/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:259, in embedding(adata, basis, color, gene_symbols, use_raw, sort_order, edges, edges_width, edges_color, neighbors_key, arrows, arrows_kwds, groups, components, dimensions, layer, projection, scale_factor, color_map, cmap, palette, na_color, na_in_legend, size, frameon, legend_fontsize, legend_fontweight, legend_loc, legend_fontoutline, colorbar_loc, vmax, vmin, vcenter, norm, add_outline, outline_width, outline_color, ncols, hspace, wspace, title, show, save, ax, return_fig, **kwargs) 252 # use itertools.product to make a plot for each color and for each component 253 # For example if color=[gene1, gene2] and components=['1,2, '2,3']. 254 # The plots are: [ 255 # color=gene1, components=[1,2], color=gene1, components=[2,3], 256 # color=gene2, components = [1, 2], color=gene2, components=[2,3], 257 # ] 258 for count, (value_to_plot, dims) in enumerate(zip(color, dimensions)): --> 259 color_source_vector = _get_color_source_vector( 260 adata, 261 value_to_plot, 262 layer=layer, 263 use_raw=use_raw, 264 gene_symbols=gene_symbols, 265 groups=groups, 266 ) 267 color_vector, categorical = _color_vector( 268 adata, 269 value_to_plot, (...) 272 na_color=na_color, 273 ) 275 # Order points File ~/.local/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:1195, in _get_color_source_vector(adata, value_to_plot, use_raw, gene_symbols, layer, groups) 1191 value_to_plot = adata.var.index[adata.var[gene_symbols] == value_to_plot][ 1192 0 1193 ] # TODO: Throw helpful error if this doesn't work 1194 if use_raw and value_to_plot not in adata.obs.columns: -> 1195 values = adata.raw.obs_vector(value_to_plot) 1196 else: 1197 values = adata.obs_vector(value_to_plot, layer=layer) File ~/.local/lib/python3.8/site-packages/anndata/_core/raw.py:171, in Raw.obs_vector(self, k) 169 def obs_vector(self, k: str) -> np.ndarray: 170 # TODO decorator to copy AnnData.obs_vector docstring --> 171 idx = self._normalize_indices((slice(None), k)) 172 a = self.X[idx] 173 if issparse(a): File ~/.local/lib/python3.8/site-packages/anndata/_core/raw.py:162, in Raw._normalize_indices(self, packed_index) 160 obs, var = unpack_index(packed_index) 161 obs = _normalize_index(obs, self._adata.obs_names) --> 162 var = _normalize_index(var, self.var_names) 163 return obs, var File ~/.local/lib/python3.8/site-packages/anndata/_core/index.py:72, in _normalize_index(indexer, index) 70 return indexer 71 elif isinstance(indexer, str): ---> 72 return index.get_loc(indexer) # int 73 elif isinstance(indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)): 74 if hasattr(indexer, "shape") and ( 75 (indexer.shape == (index.shape[0], 1)) 76 or (indexer.shape == (1, index.shape[0])) 77 ): File ~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py:3804, in Index.get_loc(self, key, method, tolerance) 3802 return self._engine.get_loc(casted_key) 3803 except KeyError as err: -> 3804 raise KeyError(key) from err 3805 except TypeError: 3806 # If we have a listlike key, _check_indexing_error will raise 3807 # InvalidIndexError. Otherwise we fall through and re-raise 3808 # the TypeError. 3809 self._check_indexing_error(key) KeyError: 'cuTAR176226'
In [50]:
lnc_bin50_all.var
Out[50]:
| real_gene_name | n_cells | n_counts | mean_umi | means | dispersions | dispersions_norm | highly_variable | ENS | |
|---|---|---|---|---|---|---|---|---|---|
| real_gene_name | |||||||||
| TSPAN6 | TSPAN6 | 49 | 82 | 1.673469 | 0.050039 | 5.026673 | 0.563557 | False | ENSG00000000003 |
| TNMD | TNMD | 22 | 42 | 1.909091 | 0.029200 | 5.252887 | 0.912412 | False | ENSG00000000005 |
| DPM1 | DPM1 | 138 | 248 | 1.797101 | 0.152459 | 5.489168 | 1.276791 | False | ENSG00000000419 |
| SCYL3 | SCYL3 | 259 | 450 | 1.737452 | 0.251395 | 5.070881 | -0.563027 | False | ENSG00000000457 |
| C1orf112 | C1orf112 | 420 | 686 | 1.633333 | 0.361828 | 5.072034 | -0.560698 | False | ENSG00000000460 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| cuTAR99983 | cuTAR99983 | 1 | 3 | 3.000000 | 0.001482 | 4.820015 | 0.244860 | False | cuTAR99983 |
| cuTAR99984 | cuTAR99984 | 2 | 2 | 1.000000 | 0.001233 | 4.029741 | -0.973857 | False | cuTAR99984 |
| cuTAR99985 | cuTAR99985 | 3 | 9 | 3.000000 | 0.008704 | 5.941693 | 1.974651 | False | cuTAR99985 |
| cuTAR99986 | cuTAR99986 | 17 | 30 | 1.764706 | 0.021736 | 5.294215 | 0.976146 | False | cuTAR99986 |
| cuTAR99988 | cuTAR99988 | 1 | 1 | 1.000000 | 0.000319 | 3.283414 | -2.124802 | False | cuTAR99988 |
206923 rows × 9 columns
In [53]:
# Check for matching rows in the .var dataframe
lnc_bin50_all.var[lnc_bin50_all.var.index.str.contains("cuTAR176226", case=False)]
Out[53]:
| real_gene_name | n_cells | n_counts | mean_umi | means | dispersions | dispersions_norm | highly_variable | ENS | |
|---|---|---|---|---|---|---|---|---|---|
| real_gene_name |
In [ ]: